2.0 Country City Lookup

In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
In [2]:
from clustergrammer2 import net
df = {}
clustergrammer2 backend version 0.2.9
In [3]:
import clustergrammer_groupby as cby
In [4]:
# load json to dict
def load_to_dict(filename):
  """Read a JSON file and return its parsed contents.

  Parameters
  ----------
  filename : str
    Path of the JSON file to read.

  Returns
  -------
  The object produced by ``json.load`` for the file's contents.

  Raises
  ------
  OSError
    If the file cannot be opened.
  ValueError
    If the file does not contain valid JSON.
  """
  import json

  # the context manager closes the handle even when parsing raises
  with open(filename, 'r') as f:
    return json.load(f)

# save dict to json
def save_to_json(inst_dict, filename, indent=True):
  """Serialize ``inst_dict`` as JSON and write it to ``filename``.

  Parameters
  ----------
  inst_dict : object
    Any JSON-serializable object.
  filename : str
    Destination path; an existing file is overwritten.
  indent : bool, optional
    When equal to ``True`` (the default) the output is pretty-printed with a
    two-space indent; any other value produces the compact form.

  Raises
  ------
  TypeError
    If ``inst_dict`` contains values that cannot be serialized.
  OSError
    If the destination cannot be opened for writing.
  """
  import json

  # serialize before opening the file: a failed dump must not truncate
  # whatever is already stored at `filename`
  if indent == True:
    payload = json.dumps(inst_dict, indent=2)
  else:
    payload = json.dumps(inst_dict)

  # the context manager guarantees the handle is closed, even on a write error
  with open(filename, 'w') as fw:
    fw.write(payload)
In [5]:
# read the two address JSON files, one dict per file
address_paths = ('../big_data/address_1k.json', '../big_data/address_2k.json')
ad1, ad2 = [load_to_dict(address_path) for address_path in address_paths]
In [6]:
# fold both batches into one lookup; when a key appears in both, the value
# from ad2 replaces the one from ad1
address_dict = {}
for address_batch in (ad1, ad2):
    address_dict.update(address_batch)
In [7]:
# write the merged address lookup to disk as JSON
save_to_json(address_dict, '../big_data/address_dict.json')
In [8]:
# number of entries in the merged address lookup
len(address_dict)
Out[8]:
3456
In [9]:
# read the challenge CSV; the unmodified frame is kept under the 'ini' key
df['ini'] = pd.read_csv('../challenge_data/dvs_challenge_1_membership_time_space.csv')

Collect Addresses, Latitude, Longitude

In [10]:
# Per-member lookups, all keyed by the row label converted to a string
# (the same key type used by address_dict).
country_dict = {}
city_dict = {}
lat_dict = {}
lng_dict = {}

for inst_index in df['ini'].index.tolist():
    inst_row = str(inst_index)

    # .loc[row, col] reads one cell directly; the chained .loc[row]['col']
    # form built a full row object for every single value
    lat_dict[inst_row] = df['ini'].loc[inst_index, 'lat']
    lng_dict[inst_row] = df['ini'].loc[inst_index, 'long']

    # members without an entry in address_dict keep the 'N.A.' placeholders
    inst_country = 'N.A.'
    inst_city = 'N.A.'

    if inst_row in address_dict:
        inst_address = address_dict[inst_row].split(', ')

        # the last comma-separated field is used as the country
        inst_country = inst_address[-1]

        # the fourth field from the end is used as the city; addresses with
        # fewer than four fields have no such field, so the placeholder stays
        # (explicit length check instead of a bare `except:` that would hide
        # unrelated errors)
        if len(inst_address) >= 4:
            inst_city = inst_address[-4]

    country_dict[inst_row] = inst_country
    city_dict[inst_row] = inst_city
In [11]:
df['ini'].head()
Out[11]:
lat long data visualization society date_with_hour date hour
0 19.075984 72.877656 3.666667 3.333333 2.666667 2/20/2019 12 2/20/2019 12
1 43.653226 -79.383184 3.333333 3.000000 3.333333 2/20/2019 12 2/20/2019 12
2 39.739236 -104.990251 3.000000 1.666667 1.666667 2/20/2019 12 2/20/2019 12
3 60.169856 24.938379 2.000000 3.666667 2.333333 2/20/2019 12 2/20/2019 12
4 38.907192 -77.036871 2.333333 4.000000 2.666667 2/20/2019 12 2/20/2019 12
In [12]:
# work on a copy of the raw frame with the location and date columns removed
dropped_cols = ['lat', 'long', 'date_with_hour', 'date']
df['clean'] = deepcopy(df['ini']).drop(dropped_cols, axis=1)
df['clean'].shape
Out[12]:
(3515, 4)
In [13]:
df['clean'].head()
Out[13]:
data visualization society hour
0 3.666667 3.333333 2.666667 12
1 3.333333 3.000000 3.333333 12
2 3.000000 1.666667 1.666667 12
3 2.000000 3.666667 2.333333 12
4 2.333333 4.000000 2.666667 12

Add City Country Categories

In [14]:
# transpose so each member becomes a column, then swap every plain column
# label for a tuple: (member name, country, city, latitude, longitude)
df['cat'] = deepcopy(df['clean'].transpose())
cols = df['cat'].columns.tolist()

new_cols = []
for member_label in cols:
    member_key = str(member_label)
    new_cols.append((
        'P-' + member_key,
        'Country: ' + country_dict[member_key],
        'City: ' + city_dict[member_key],
        'Lat: ' + str(lat_dict[member_key]),
        'Long: ' + str(lng_dict[member_key]),
    ))

df['cat'].columns = new_cols
In [15]:
# row labels of the transposed frame, kept as a plain list
rows = df['cat'].index.tolist()
In [16]:
# raw values as a float array; `.values` replaces DataFrame.get_values(),
# which is deprecated and no longer exists in pandas >= 1.0
mat = df['cat'].values.astype('float')
In [17]:
# rebuild the frame from the float matrix with the tuple column labels
df['proc'] = pd.DataFrame(columns=new_cols, index=rows, data=mat)
In [18]:
# keep only the columns whose country label (second tuple element) does not
# contain the 'N.A.' placeholder, and report the before/after column counts
cols = df['proc'].columns.tolist()
keep_cols = []
for col_label in cols:
    if 'N.A.' not in col_label[1]:
        keep_cols.append(col_label)
print(len(cols), len(keep_cols))

# NOTE(review): the filtered frame is stored under 'prot' and no later visible
# cell reads that key -- the following cells keep using the unfiltered
# df['proc']; confirm whether that is intended
df['prot'] = df['proc'][keep_cols]
3515 3456
In [19]:
df['proc'].head()
Out[19]:
(P-0, Country: India, City: Mumbai Suburban, Lat: 19.0759837, Long: 72.8776559) (P-1, Country: Canada, City: Toronto, Lat: 43.653226, Long: -79.3831843) (P-2, Country: USA, City: Denver County, Lat: 39.7392358, Long: -104.990251) (P-3, Country: Finland, City: Southern Finland, Lat: 60.1698557, Long: 24.9383791) (P-4, Country: USA, City: Washington, Lat: 38.9071923, Long: -77.0368707) (P-5, Country: Brazil, City: Rio Grande do Sul, Lat: -30.0346471, Long: -51.2176584) (P-6, Country: USA, City: Cook County, Lat: 41.8781136, Long: -87.6297982) (P-7, Country: USA, City: Washington, Lat: 38.9071923, Long: -77.0368707) (P-8, Country: United Kingdom, City: London, Lat: 51.5073509, Long: -0.1277583) (P-9, Country: India, City: Bangalore Urban, Lat: 12.9715987, Long: 77.5945627) ... (P-3505, Country: Canada, City: Halifax County, Lat: 44.6487635, Long: -63.5752387) (P-3506, Country: Chile, City: Provincia de Marga Marga, Lat: -33.0482707, Long: -71.4408752) (P-3507, Country: USA, City: Fulton County, Lat: 33.7489954, Long: -84.3879824) (P-3508, Country: USA, City: Los Angeles County, Lat: 34.0966764, Long: -117.7197785) (P-3509, Country: USA, City: Cook County, Lat: 41.8781136, Long: -87.6297982) (P-3510, Country: Luxembourg, City: Esch-sur-Alzette, Lat: 49.5008805, Long: 5.9860925) (P-3511, Country: USA, City: Washington, Lat: 38.9071923, Long: -77.0368707) (P-3512, Country: USA, City: Washington, Lat: 38.9071923, Long: -77.0368707) (P-3513, Country: USA, City: Harris County, Lat: 29.7604267, Long: -95.3698028) (P-3514, Country: USA, City: New Haven County, Lat: 41.308274, Long: -72.9278835)
data 3.666667 3.333333 3.000000 2.000000 2.333333 4.000000 4.000000 3.0 0.666667 3.000000 ... 3.0 2.666667 1.666667 1.666667 3.666667 2.000000 3.333333 1.333333 2.000000 1.666667
visualization 3.333333 3.000000 1.666667 3.666667 4.000000 2.000000 2.666667 4.0 1.666667 3.666667 ... 2.0 2.000000 4.333333 3.000000 3.333333 2.000000 1.000000 2.333333 4.000000 1.000000
society 2.666667 3.333333 1.666667 2.333333 2.666667 3.333333 3.000000 4.0 2.666667 1.666667 ... 0.0 1.666667 4.000000 2.333333 2.000000 2.333333 3.333333 2.666667 0.666667 0.000000
hour 12.000000 12.000000 12.000000 12.000000 12.000000 12.000000 12.000000 12.0 12.000000 12.000000 ... 9.0 10.000000 11.000000 11.000000 12.000000 12.000000 12.000000 12.000000 12.000000 12.000000

4 rows × 3515 columns

In [20]:
# fixed colors for five country labels in column category 1
# NOTE(review): the recorded output shows each of these calls printing
# "there was an error setting the category color"; this cell runs before any
# data is loaded into `net` -- confirm whether that is the cause
country_colors = [
    ('Country: USA', 'blue'),
    ('Country: United Kingdom', 'white'),
    ('Country: Canada', 'red'),
    ('Country: India', 'green'),
    ('Country: Australia', 'black'),
]
for country_name, country_color in country_colors:
    net.set_cat_color(axis='col', cat_index=1, cat_name=country_name, inst_color=country_color)
there was an error setting the category color
there was an error setting the category color
there was an error setting the category color
there was an error setting the category color
there was an error setting the category color

All Members

In [21]:
# load the full member matrix (df['proc'], which still includes the 'N.A.'
# columns), swap NaN for zero, apply the requested z-score normalization
# along the row axis, then render the interactive widget
net.load_df(df['proc'])
net.swap_nan_for_zero()
net.normalize(axis='row', norm_type='zscore')
net.widget()
In [22]:
# NOTE(review): generate_signatures comes from clustergrammer_groupby;
# presumably it summarizes the columns per 'Country' category -- confirm.
# Only df_sig is read by the following cells; the recorded run also emitted
# numpy/scipy RuntimeWarnings ("Degrees of freedom <= 0 for slice").
df_sig, keep_genes_dict, df_gene_pval, all_fold_info = cby.generate_signatures(df['proc'], category_level='Country')
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/numpy/core/fromnumeric.py:3146: RuntimeWarning: Degrees of freedom <= 0 for slice
  **kwargs)
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/numpy/core/_methods.py:125: RuntimeWarning: invalid value encountered in true_divide
  ret, rcount, out=ret, casting='unsafe', subok=False)
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater
  return (self.a < x) & (x < self.b)
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less
  return (self.a < x) & (x < self.b)
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:1821: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= self.a)

Country Signatures

In [23]:
# visualize the signature frame from the 'Country' generate_signatures call,
# with the requested z-score normalization along the row axis
net.load_df(df_sig)
net.normalize(axis='row', norm_type='zscore')
net.widget()

U.S.A

In [24]:
# Column labels are tuples of (name, country, city, lat, long): country labels
# are addressed with cat_index=1 and city labels with cat_index=2. The San
# Francisco and Washington entries were previously registered under
# cat_index=1, where no 'City: ...' label ever occurs.
net.set_cat_color(axis='col', cat_index=2, cat_name='City: New York City', inst_color='blue')
net.set_cat_color(axis='col', cat_index=2, cat_name='City: San Francisco and County', inst_color='white')
net.set_cat_color(axis='col', cat_index=2, cat_name='City: Washington', inst_color='red')
net.set_cat_color(axis='col', cat_index=1, cat_name='Country: India', inst_color='green')
In [25]:
# reload the full member matrix and filter on 'Country: USA' in column
# category 1; the export is taken before normalizing, so df['usa'] holds the
# un-normalized values
net.load_df(df['proc'])
net.filter_cat(axis='col', cat_index=1, cat_name='Country: USA')
df['usa'] = net.export_df()
net.normalize(axis='row', norm_type='zscore')
net.widget()
In [26]:
# same helper as for the country signatures, now on the USA subset with
# category_level='City'; this overwrites df_sig and the other three results
# from the earlier 'Country' call
df_sig, keep_genes_dict, df_gene_pval, all_fold_info = cby.generate_signatures(df['usa'], category_level='City')
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/numpy/core/fromnumeric.py:3146: RuntimeWarning: Degrees of freedom <= 0 for slice
  **kwargs)
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/numpy/core/_methods.py:125: RuntimeWarning: invalid value encountered in true_divide
  ret, rcount, out=ret, casting='unsafe', subok=False)
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater
  return (self.a < x) & (x < self.b)
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less
  return (self.a < x) & (x < self.b)
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:1821: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= self.a)
In [27]:
df_sig.shape
Out[27]:
(4, 256)

U.S.A City Signatures

In [28]:
# visualize the city-level signature frame for the USA subset, with the
# requested z-score normalization along the row axis
net.load_df(df_sig)
net.normalize(axis='row', norm_type='zscore')
net.widget()
In [ ]: